	/**************************************************************************
	* Step 1: Read the raw file
	***************************************************************************/

	data C5597    ;
	    infile "/.../F5500/FOIAed/1997/C5597.txt" lrecl=3105; 
	    input 
		/*redacted*/
	run;

	data F5597    ;
	    infile "/.../F5500/FOIAed/1997/F5597.txt" lrecl=4142; 
	    input 
		/*redacted*/
	run;

	data R5597    ;
	    infile "/.../F5500/FOIAed/1997/R5597.txt" lrecl=1935; 
	    input 
		/*redacted*/
	run;


	/*Stack each file*/
	data raw97;
	  set c5597 f5597 r5597;
	run;

	/**************************************************************************
	* Step 2: clean the raw file and make variable names consistent
	***************************************************************************/

	data raw97
		(keep = DB
			DC
			cash_bal
			xDB
			plan_eff_year
			plan_qtr_strt
			plan_year_strt
			plan_qtr_end
			plan_year_end
			form_year
			filing_id
			filing_id_char
			adj_active
			collective_bargain_ind
			opr_ein
			opr_pn
			partcp_account_bal_cnt_n
			pension_benefit_plan_ind
			tot_active_partcp_cnt_n
			type_pension_bnft_code
			type_plan_entity_ind
			tot_income_amt_n
			tot_contrib_amt_n
			emplr_contrib_income_amt_n
			employee_contrib_income_amt_n
			t);					/*this is used to de-duplicate*/													
		set raw97 (rename = (COLLECTIVE_BARGN_IND=collective_bargain_ind
			  PENSION_FEATURES_CD = type_pension_bnft_code));
		if  TYPE_PENSION_BNFT_PLAN_IND in /*redacted*/ then DB=/*redacted*/;					/*has DB characteristics*/
			else DB=/*redacted*/;
		if TYPE_PENSION_BNFT_PLAN_IND=/*redacted*/ then cash_bal=/*redacted*/;
			else cash_bal=/*redacted*/;
		if DB=/*redacted*/ & cash_bal=/*redacted*/ then xDB=/*redacted*/;
			else xDB=/*redacted*/;
		if  TYPE_PENSION_BNFT_PLAN_IND in /*redacted*/ then DC=/*redacted*/;	/*has DC characteristics*/
			else DC=/*redacted*/;
		plan_name = upcase(plan_name);
		if /*redacted*/or 
		/*redacted*/ then supplemental=/*redacted*/;	/*is supplemental*/
			else supplemental=/*redacted*/;
		if supplemental=/*redacted*/ then delete;	/*drop supplemental plans*/

		tot_active_partcp_cnt_n=/*redacted*/;
		if tot_active_partcp_cnt_n=. then tot_active_partcp_cnt_n = /*redacted*/;
		partcp_account_bal_cnt_n=/*redacted*/;
	
		form_plan_year_begin_date = left(trim(form_plan_year_begin_date));
		fpy_begin_yr = /*redacted*/; 	/*plan start year*/
		fpy_begin_mth = /*redacted*/; 	/*plan start month*/
		fpy_begin_day = /*redacted*/;	/*plan start day*/
		fpy_date = mdy(fpy_begin_mth,fpy_begin_day,fpy_begin_yr);		/*plan start sas date*/
		plan_qtr_strt = qtr(fpy_date);						/*plan calendar start qtr*/
		plan_year_strt = year(fpy_date);					/*plan calendar start year*/

		form_tax_prd = left(trim(FORM_PLAN_YEAR_END_DATE)); 
		ftp_yr = /*redacted*/; 							/*plan end year*/
		ftp_mth = /*redacted*/; 						/*plan end month*/
		ftp_day = /*redacted*/;							/*plan end day*/
		ftp_date = mdy(ftp_mth,ftp_day,ftp_yr);					/*plan end sas date*/
		plan_qtr_end = qtr(ftp_date);						/*plan calendar end qtr*/
		plan_year_end = year(ftp_date);						/*plan calendar end year*/

		form_year = /*redacted*/;						/*rename for consistency with research file*/
		filing_id_char = dln;						/*string filing id=DLN*/
		filing_id = /*redacted*/;				/*numeric filing id*/
		
		adj_active = tot_active_partcp_cnt_n;				/*no editing for the raw file*/
		opr_ein = left(trim(spons_dfe_ein));				/*rename for consistency with research file*/
		opr_pn = left(trim(spons_dfe_pn));				/*rename for consistency with research file*/
		len_pn = length(opr_pn);					/*length of pn field*/
		if len_pn=/*redacted*/ then opr_pn = /*redacted*/;
		else if len_pn=/*redacted*/ then opr_pn = /*redacted*/;
		/*Redacted: removing invalid EIN/PNs*/

		plan_eff_year = /*redacted*/;		/*year that the plan started*/	
		type_plan_entity_ind = left(trim(type_plan_entity_ind));
		if type_plan_entity_ind^=/*redacted*/ then delete;			/*keep only single employer plans*/
		if DB=/*redacted*/ and DC=/*redacted*/ then delete; 					/*remove welfare plans*/
		pension_benefit_plan_ind = /*redacted*/;					/*all plans are now pension plans*/
		t = /*redacted*/;								/*helper used for de-duplication step*/
	run;

	/*Set variable lengths to enable error-free appending with the research file*/
	data raw97;
	
		/*redacted*/
		
		set raw97;
	run;

	/**************************************************************************
	* Step 3: De-duplicate the file 
	***************************************************************************/
	
	proc sort data = raw97;
		by opr_ein opr_pn plan_year_end;
	run;
	
	proc means noprint data = raw97;
		by opr_ein opr_pn plan_year_end;
		var filing_id;
		output out = best_filing 
		max(filing_id) = max_filing_id;
	run;
	
	proc sql;
		create table clean97 as
		select a.*, b.max_filing_id
		from raw97 as a inner join
		best_filing as b
		on a.filing_id = b.max_filing_id;
	quit;

	data  clean97;
		set clean97 (drop= t max_filing_id);
		source_id = /*redacted*/; /*id for raw file*/
	run;

	/*Remove exact filing_id dups*/
	proc sort data = clean97 nodupkey;
	      by opr_ein opr_pn plan_year_end filing_id_char;
	run;

	/**************************************************************************
	* Step 4: Count up # DB plans per EIN
	***************************************************************************/
	
	proc sort data = clean97;
		by opr_ein;
	run;
	
	proc means noprint data = clean97;
		by opr_ein;
		var DB;				
		output out = plan_count97
			sum(DB) = num_DB_plans;
	run;
	
	/*Check how many participants are in firms with 2+ DB plans*/
	proc sql;
		create table pens97_plncnt as 
		select *
		from clean97 as t,
		plan_count97 as m
		where t.opr_ein = m.opr_ein;
	quit;
		
	data blue.pensplan_samp97 (rename = 
	     (tot_active_partcp_cnt_n=tot_active_partcp_cnt
	      partcp_account_bal_cnt_n=partcp_account_bal_cnt));
		set pens97_plncnt
		(drop = _type_ _freq_ xDB);
		adj_DB_partcp = DB*adj_active;
		DB_partcp = DB*tot_active_partcp_cnt_n;
	run;

	/*De-dup small number of cases where filing errors create EIN-PN_Plan year end dups
	  i.e. the filing_id's are different but the plan years are mis-aligned*/
	proc sort data = blue.pensplan_samp97 nodupkey;
	  by opr_ein opr_pn plan_year_end;
	run;
	
	/*Clean up the working directory*/
	proc datasets lib=work nolist kill;
	quit;
	run;
